library(phyloseq); packageVersion("phyloseq")
## [1] '1.46.0'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.5.1'
library(readr); packageVersion("readr")
## [1] '2.1.5'
library(tidyr); packageVersion("tidyr")
## [1] '1.3.1'
library(purrr); packageVersion("purrr")
## [1] '1.0.2'
library(furrr); packageVersion("furrr")
## Loading required package: future
## Warning: package 'future' was built under R version 4.3.3
## [1] '0.3.1'
library(dplyr); packageVersion("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] '1.1.4'
library(stringr); packageVersion("stringr")
## [1] '1.5.1'
library(forcats); packageVersion("forcats")
## [1] '1.0.0'
library(metacoder); packageVersion("metacoder")
## This is metacoder version 0.3.7 (stable)
##
## Attaching package: 'metacoder'
## The following object is masked from 'package:ggplot2':
##
## map_data
## The following object is masked from 'package:phyloseq':
##
## filter_taxa
## [1] '0.3.7'
library(data.table); packageVersion("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## [1] '1.15.4'
library(decontam); packageVersion("decontam")
## [1] '1.22.0'
library(Biostrings); packageVersion("Biostrings")
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:metacoder':
##
## reverse
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:metacoder':
##
## complement
## The following object is masked from 'package:base':
##
## strsplit
## [1] '2.70.3'
library(magick); packageVersion("magick")
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.93
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
## [1] '2.8.4'
library(vegan); packageVersion("vegan")
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-6.1
## [1] '2.6.6.1'
library(pdftools);packageVersion("pdftools")
## Warning: package 'pdftools' was built under R version 4.3.3
## Using poppler version 23.04.0
## [1] '3.4.1'
library(vegan); packageVersion("vegan")
## [1] '2.6.6.1'
library(grid)
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
knitr::opts_knit$set(root.dir = "~/benchmark_demulticoder")
seed <- 1
set.seed(seed)
asv_matrix_rps10<-read.csv("demulticoder/data/final_asv_abundance_matrix_rps10.csv")
asv_matrix_rps10$dada2_tax <- asv_matrix_rps10$dada2_tax <- gsub("Stramenopila", "Eukaryota--100--Domain;Stramenopila", asv_matrix_rps10$dada2_tax)
asv_matrix_rps10 <- asv_matrix_rps10[, -1]
colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)] <- gsub("_rps10$", "", colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)])
asv_matrix_its<-read.csv("demulticoder/data/final_asv_abundance_matrix_its.csv")
asv_matrix_its$dada2_tax <- gsub("Fungi", "Eukaryota--100--Domain;Fungi", asv_matrix_its$dada2_tax)
asv_matrix_its <- asv_matrix_its[, -1]
colnames(asv_matrix_its)[3:ncol(asv_matrix_rps10)] <- gsub("_its$", "", colnames(asv_matrix_its)[3:ncol(asv_matrix_its)])
#Let's combine these matrices
#For easier analysis, we previously combined the two matrices, and appended domain info to each one so we can make one heat tree for combined dataset
sample_cols_its <- setdiff(names(asv_matrix_its), c("sequence", "dada2_tax"))
sample_cols_rps10 <- setdiff(names(asv_matrix_rps10), c("sequence", "dada2_tax"))
if(!all(sample_cols_its == sample_cols_rps10)) {
stop("Sample columns do not match between ITS and RPS10 dataframes!")
}
abundance <- rbind(
asv_matrix_its[, c("sequence", "dada2_tax", sample_cols_its)], # ITS data
asv_matrix_rps10[, c("sequence", "dada2_tax", sample_cols_rps10)] # RPS10 data
)
write.csv(abundance, "demulticoder/data/final_asv_abundance_matrix_combined_demulticoder.csv")
metadata_path <- file.path("demulticoder/data/metadata.csv")
metadata <- read_csv(metadata_path)
## Rows: 174 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_name, well, organism, sample_type
## dbl (3): plate, path_conc, experiment
## lgl (2): flooded, is_ambiguous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(metadata)
## # A tibble: 174 × 9
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <chr> <lgl> <dbl> <dbl> <chr>
## 1 S1 1 A01 Cry TRUE 100 1 Sample
## 2 S10 1 B02 Cry TRUE 1 1 Sample
## 3 S100 2 B02 Cin TRUE 1 2 Sample
## 4 S101 2 C02 Plu FALSE 1 2 Sample
## 5 S102 2 D02 Cry TRUE 1 2 Sample
## 6 S103 2 F02 Cin FALSE 1 2 Sample
## 7 S104 2 G02 Plu FALSE 1 2 Sample
## 8 S105 2 H02 Cry TRUE 100 2 Sample
## 9 S106 2 A03 Plu FALSE 100 2 Sample
## 10 S107 2 B03 Cin TRUE 100 2 Sample
## # ℹ 164 more rows
## # ℹ 1 more variable: is_ambiguous <lgl>
#Fourth, let's track reads
track_reads_demulticoder_its<-read.csv("demulticoder/data/track_reads_its.csv", row.names = 1)
track_reads_demulticoder_rps10<-read.csv("demulticoder/data/track_reads_rps10.csv", row.names = 1)
its_summary_reads<-summary(track_reads_demulticoder_its)
print(its_summary_reads)
## input filtered denoisedF denoisedR
## Min. : 82 Min. : 1 Min. : 1 Min. : 1
## 1st Qu.: 23440 1st Qu.:12671 1st Qu.:12375 1st Qu.:12308
## Median : 37507 Median :23008 Median :22684 Median :22513
## Mean : 42989 Mean :25466 Mean :25212 Mean :25078
## 3rd Qu.: 57640 3rd Qu.:31912 3rd Qu.:31614 3rd Qu.:31508
## Max. :136509 Max. :87421 Max. :87247 Max. :87170
## merged nonchim
## Min. : 0 Min. : 0
## 1st Qu.:11794 1st Qu.:11794
## Median :21770 Median :21770
## Mean :24322 Mean :24230
## 3rd Qu.:30406 3rd Qu.:30050
## Max. :86880 Max. :86868
rps10_summary_reads<-summary(track_reads_demulticoder_rps10)
print(rps10_summary_reads)
## input filtered denoisedF denoisedR
## Min. : 91 Min. : 26 Min. : 7 Min. : 6
## 1st Qu.: 37994 1st Qu.: 29199 1st Qu.: 29186 1st Qu.: 29184
## Median : 61070 Median : 48085 Median : 47950 Median : 48060
## Mean : 73511 Mean : 56500 Mean : 56435 Mean : 56433
## 3rd Qu.: 95487 3rd Qu.: 74825 3rd Qu.: 74408 3rd Qu.: 74436
## Max. :263101 Max. :219382 Max. :218997 Max. :219291
## merged nonchim
## Min. : 0 Min. : 0
## 1st Qu.: 29172 1st Qu.: 29172
## Median : 47653 Median : 47653
## Mean : 56030 Mean : 55349
## 3rd Qu.: 73485 3rd Qu.: 73400
## Max. :218521 Max. :217372
# Function to extract taxonomic levels and count unique entries with bootstrap filtering
# Function to extract taxonomic levels and count unique entries
analyze_taxonomy <- function(data) {
# Get taxonomy data
tax_data <- data$dada2_tax
# Split taxonomy string into components
tax_splits <- strsplit(tax_data, ";")
# Function to safely extract taxonomic level with proper rank checking
extract_tax_level <- function(tax_array, rank) {
# Find the position containing the rank
rank_pos <- grep(rank, tax_array)
if(length(rank_pos) > 0) {
# Extract and clean the taxonomy name
tax <- tax_array[rank_pos]
# Remove the rank pattern and any leading/trailing whitespace
cleaned_tax <- trimws(gsub(paste0("--\\d+--", rank), "", tax))
return(cleaned_tax)
}
return(NA)
}
# Extract each taxonomic level with proper hierarchy checking
families <- sapply(tax_splits, function(x) extract_tax_level(x, "Family"))
genera <- sapply(tax_splits, function(x) extract_tax_level(x, "Genus"))
species <- sapply(tax_splits, function(x) extract_tax_level(x, "Species"))
# Create a data frame to check hierarchy consistency
tax_df <- data.frame(
Family = families,
Genus = genera,
Species = species,
stringsAsFactors = FALSE
)
# Remove NA values before counting
families <- families[!is.na(families)]
genera <- genera[!is.na(genera)]
species <- species[!is.na(species)]
# Count unique entries
unique_counts <- list(
families = length(unique(families)),
genera = length(unique(genera)),
species = length(unique(species))
)
# Get prevalence with hierarchy checking
family_prev <- table(families)
genus_prev <- table(genera)
species_prev <- table(species)
# Sort prevalence tables
family_prev <- sort(family_prev, decreasing = TRUE)
genus_prev <- sort(genus_prev, decreasing = TRUE)
species_prev <- sort(species_prev, decreasing = TRUE)
# Print top 5 of each level for verification
cat("\nTop 5 most prevalent Families:\n")
print(head(family_prev, 5))
cat("\nTop 5 most prevalent Genera:\n")
print(head(genus_prev, 5))
cat("\nTop 5 most prevalent Species:\n")
print(head(species_prev, 5))
# Get most prevalent taxa with hierarchy verification
most_prevalent <- list(
family = if(length(family_prev) > 0) names(family_prev)[1] else "None found",
genus = if(length(genus_prev) > 0) names(genus_prev)[1] else "None found",
species = if(length(species_prev) > 0) names(species_prev)[1] else "None found"
)
return(list(
unique_counts = unique_counts,
most_prevalent = most_prevalent,
tax_df = tax_df # Return full taxonomy dataframe for inspection
))
}
# Run analysis for both datasets
if(exists("asv_matrix_rps10")) {
rps10_results <- analyze_taxonomy(asv_matrix_rps10)
}
##
## Top 5 most prevalent Families:
## families
## Pythiaceae Peronosporaceae Saprolegniaceae Saplolegniaceae Lagenidiaceae
## 171 126 20 10 9
##
## Top 5 most prevalent Genera:
## genera
## Pythium Phytophthora Aphanomyces Peronospora Saprolegnia
## 152 96 18 16 11
##
## Top 5 most prevalent Species:
## species
## Phytophthora_sp.kununurra Phytophthora_citrophthora Pythium_lutarium
## 23 19 19
## Pythium_dissotocum Phytophthora_sojae
## 14 13
if(exists("asv_matrix_its")) {
its_results <- analyze_taxonomy(asv_matrix_its)
}
##
## Top 5 most prevalent Families:
## families
## Fungi_fam_Incertae_sedis Serendipitaceae
## 366 200
## Rozellomycota_fam_Incertae_sedis Hyaloscyphaceae
## 176 157
## Tuberaceae
## 128
##
## Top 5 most prevalent Genera:
## genera
## Fungi_gen_Incertae_sedis Serendipita
## 366 196
## Rozellomycota_gen_Incertae_sedis Tuber
## 176 128
## Hyaloscypha
## 94
##
## Top 5 most prevalent Species:
## species
## Fungi_sp Serendipita_sp Rozellomycota_sp
## 366 190 176
## Tuber_pseudobrumale Archaeorhizomyces_sp
## 128 48
First we will configure the combined taxmap object
#load reference database
rps10_database <- read_fasta("demulticoder/data/rps10_reference_db.fa")
innoc_species <- c(Cin = "Phytophthora_cinnamomi", Plu = "Phytophthora_plurivora", Cry = "Pythium_cryptoirregulare")
innoc_regex <- paste0('(', paste0(innoc_species, collapse = '|'), ')')
innoc_seqs <- rps10_database[grepl(names(rps10_database), pattern = innoc_regex)]
innoc_seqs <- innoc_seqs[! duplicated(innoc_seqs)]
names(innoc_seqs) <- str_match(names(innoc_seqs), pattern = innoc_regex)[, 2]
iupac_match <- function(asv_chars, ref_chars) {
map2_lgl(asv_chars, ref_chars, function(asv, ref) grepl(asv, pattern = paste0('[', IUPAC_CODE_MAP[ref], ']+')))
}
# Count number of mismatches in an alignment, allowing for IUPAC codes in reference
align_mismatch <- function(alignment) {
asv_chars <- strsplit(as.character(alignment@pattern), '')[[1]]
ref_chars <- strsplit(as.character(alignment@subject), '')[[1]]
sum(! iupac_match(asv_chars, ref_chars))
}
# Align each sequence to each asv and make table of results"
aligned_data_path <- file.path("demulticoder/data", "infection_aligned_data.rds")
if (file.exists(aligned_data_path)) {
aligned_data <- readRDS(aligned_data_path)
} else {
aligned_data <- future_map_dfr(seq_along(innoc_seqs), function(i) {
aligned <- lapply(abundance$sequence, function(asv) pairwiseAlignment(pattern = asv, subject = innoc_seqs[i], type = 'global-local'))
print(paste("Processing species:", names(innoc_seqs)[i]))
tibble(
species = names(innoc_seqs)[i],
align_len = map_dbl(aligned, nchar),
mismatch = map_dbl(aligned, align_mismatch),
pid = (align_len - mismatch) / align_len,
asv_seq = abundance$sequence,
ref_seq = innoc_seqs[i],
alignment = aligned
)
})
saveRDS(aligned_data, file = aligned_data_path)
}
# Convert ASV abundances to proportions for use with matches
abundance_prop <- abundance
abundance_prop[metadata$sample_name] <- lapply(abundance_prop[metadata$sample_name], function(counts) counts/sum(counts))
innoculum_asv_mismatch_threshold <-1
# Sum the read counts of ASVs representing the same species used for inoculation
infection_data <- aligned_data %>%
filter(mismatch <= innoculum_asv_mismatch_threshold) %>%
left_join(abundance_prop, by = c(asv_seq = "sequence")) %>%
group_by(asv_seq) %>% slice_min(mismatch, with_ties = FALSE) # Only consider the best match per ASV
## Update abundance matrix and metadata with results
inoculum_asv_key <- setNames(infection_data$species, infection_data$asv_seq)
abundance <- mutate(abundance, is_inoculum = sequence %in% infection_data$asv_seq, inoculum = inoculum_asv_key[sequence], .after = "dada2_tax")
write_csv(abundance, file.path('demulticoder/data', 'abundance_with_infection_data.csv'))
#Let's append metadata table with more info
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
mutate(species = species_name_key[species]) %>%
mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
left_join(sample_infection_data, by = "sample_name")
# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(NA)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]])
}
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
if (metadata$sample_type[i] == "Mock community" || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(sum(metadata[i, prop_cols]))
} else {
expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
return(sum(metadata[i, prop_cols]))
}
})
# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0)
}
})
# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
}
})
# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
##
## FALSE TRUE
## 88 82
table(metadata$only_expected_innoc)
##
## FALSE TRUE
## 107 63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
## Cin Control Cry Plu
## 26 9 16 8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))
# Get per-sample proportions of reads
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
tidyr::gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
mutate(species = species_name_key[species]) %>%
mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
left_join(sample_infection_data, by = "sample_name")
# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(NA)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]])
}
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
if (metadata$sample_type[i] == "Mock community" || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(sum(metadata[i, prop_cols]))
} else {
expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
return(sum(metadata[i, prop_cols]))
}
})
# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0)
}
})
# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
}
})
# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
##
## FALSE TRUE
## 88 82
table(metadata$only_expected_innoc)
##
## FALSE TRUE
## 107 63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
## Cin Control Cry Plu
## 26 9 16 8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))
metadata <- filter(metadata, ! is_ambiguous)
abundance <- filter(abundance, ! is_inoculum)
#Filter out mock community samples and also any mock community samples as well
metadata <- filter(metadata, sample_type != "Mock community" & sample_type != "Negative control")
obj <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj$data) <- c('abund', 'score')
obj <- transmute_obs(obj, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
# For diversity calculations, we'll use proportions of read depth
# We'll set any proportion to 0 that is less than the inverse of the read count of the non-control sample with the fewest reads.
# This should account for unequal sample read depth without the randomness of rarefaction.
metadata$raw_count <- colSums(obj$data$abund[, metadata$sample_name])
lowest_count <- min(metadata$raw_count[! is.na(metadata$organism)])
lowest_count
## [1] 12676
obj$data$prop <- calc_obs_props(obj, data = 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj$data$prop <- zero_low_counts(obj, data = 'prop', min_count = 1 / lowest_count, cols = metadata$sample_name)
## Zeroing 2166 of 438534 counts less than 7.88892395077311e-05.
obj$data$prop
## # A tibble: 2,707 × 163
## taxon_id S1 S10 S100 S101 S102 S103 S104 S105
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bga 0.0821 0.101 0.00142 0.0396 0.0345 1.90e-3 8.63e-4 1.68e-2
## 2 bgb 0.00867 0.0190 0.0154 0.00757 0.00565 2.95e-2 3.11e-2 8.06e-3
## 3 bgc 0.134 0 0 0.0316 0 0 0 0
## 4 bgd 0.00482 0 0 0.0694 0.000199 4.26e-4 1.96e-2 2.74e-3
## 5 bge 0.00584 0.00589 0.00680 0.0121 0.00495 7.28e-3 2.41e-2 4.03e-3
## 6 bgf 0.000106 0.0137 0.00337 0.0190 0.00635 1.62e-2 5.25e-3 8.30e-3
## 7 bgg 0.000158 0.000228 0.000778 0.000188 0.000350 4.55e-4 1.23e-3 4.25e-4
## 8 bgh 0.00322 0.0271 0.00405 0.00110 0.00304 2.50e-3 6.02e-3 1.68e-3
## 9 bgc 0 0 0 0 0 0 0 0
## 10 bgc 0 0 0.00461 0 0 0 0 0
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S106 <dbl>, S107 <dbl>, S108 <dbl>, S109 <dbl>,
## # S11 <dbl>, S110 <dbl>, S111 <dbl>, S112 <dbl>, S113 <dbl>, S114 <dbl>,
## # S115 <dbl>, S116 <dbl>, S117 <dbl>, S118 <dbl>, S119 <dbl>, S12 <dbl>,
## # S120 <dbl>, S121 <dbl>, S122 <dbl>, S123 <dbl>, S124 <dbl>, S125 <dbl>,
## # S126 <dbl>, S127 <dbl>, S128 <dbl>, S129 <dbl>, S13 <dbl>, S130 <dbl>,
## # S131 <dbl>, S132 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>, S136 <dbl>, …
obj$data$prop[metadata$sample_name] <- map(metadata$sample_name, function(id) {
out <- obj$data$prop[[id]]
out[is.na(out) | is.nan(out)] <- 0
out
})
obj$data$prop
## # A tibble: 2,707 × 163
## taxon_id S1 S10 S100 S101 S102 S103 S104 S105
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bga 0.0821 0.101 0.00142 0.0396 0.0345 1.90e-3 8.63e-4 1.68e-2
## 2 bgb 0.00867 0.0190 0.0154 0.00757 0.00565 2.95e-2 3.11e-2 8.06e-3
## 3 bgc 0.134 0 0 0.0316 0 0 0 0
## 4 bgd 0.00482 0 0 0.0694 0.000199 4.26e-4 1.96e-2 2.74e-3
## 5 bge 0.00584 0.00589 0.00680 0.0121 0.00495 7.28e-3 2.41e-2 4.03e-3
## 6 bgf 0.000106 0.0137 0.00337 0.0190 0.00635 1.62e-2 5.25e-3 8.30e-3
## 7 bgg 0.000158 0.000228 0.000778 0.000188 0.000350 4.55e-4 1.23e-3 4.25e-4
## 8 bgh 0.00322 0.0271 0.00405 0.00110 0.00304 2.50e-3 6.02e-3 1.68e-3
## 9 bgc 0 0 0 0 0 0 0 0
## 10 bgc 0 0 0.00461 0 0 0 0 0
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S106 <dbl>, S107 <dbl>, S108 <dbl>, S109 <dbl>,
## # S11 <dbl>, S110 <dbl>, S111 <dbl>, S112 <dbl>, S113 <dbl>, S114 <dbl>,
## # S115 <dbl>, S116 <dbl>, S117 <dbl>, S118 <dbl>, S119 <dbl>, S12 <dbl>,
## # S120 <dbl>, S121 <dbl>, S122 <dbl>, S123 <dbl>, S124 <dbl>, S125 <dbl>,
## # S126 <dbl>, S127 <dbl>, S128 <dbl>, S129 <dbl>, S13 <dbl>, S130 <dbl>,
## # S131 <dbl>, S132 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>, S136 <dbl>, …
#Let's modify the metadata sheet
metadata <- metadata %>%
mutate(organism = fct_relevel(ordered(organism), "Control", "Cin", "Plu", "Cry"))
abund_table <- obj$data$abund[metadata$sample_name]
metadata$richness <- vegan::specnumber(abund_table, MARGIN = 2)
metadata$shannon <- vegan::diversity(abund_table, MARGIN = 2, index = "shannon")
metadata$invsimpson <- vegan::diversity(abund_table, MARGIN = 2, index = "invsimpson")
write.csv(metadata, file.path("demulticoder", "results/alpha_diversity.csv"))
print(metadata)
## # A tibble: 162 × 21
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <ord> <lgl> <dbl> <dbl> <chr>
## 1 S1 1 A01 Cry TRUE 100 1 Sample
## 2 S10 1 B02 Cry TRUE 1 1 Sample
## 3 S100 2 B02 Cin TRUE 1 2 Sample
## 4 S101 2 C02 Plu FALSE 1 2 Sample
## 5 S102 2 D02 Cry TRUE 1 2 Sample
## 6 S103 2 F02 Cin FALSE 1 2 Sample
## 7 S104 2 G02 Plu FALSE 1 2 Sample
## 8 S105 2 H02 Cry TRUE 100 2 Sample
## 9 S106 2 A03 Plu FALSE 100 2 Sample
## 10 S107 2 B03 Cin TRUE 100 2 Sample
## # ℹ 152 more rows
## # ℹ 13 more variables: is_ambiguous <lgl>, cin_prop <dbl>, cry_prop <dbl>,
## # plu_prop <dbl>, expected_innoc_prop <dbl>, unexpected_innoc_prop <dbl>,
## # expected_innoc <lgl>, only_expected_innoc <lgl>, valid_inoc <lgl>,
## # raw_count <dbl>, richness <int>, shannon <dbl>, invsimpson <dbl>
plotted_factors <- c('Organism' = 'organism', 'Flooded' = 'flooded', 'Pathogen Concentration' = 'path_conc', 'Trial' = 'experiment')
# Reformat data for plotting
alpha_plot_data <- plotted_factors %>%
map2_dfr(names(plotted_factors),
function(factor, factor_name) {
out <- metadata
out$factor <- factor_name
out$value <- as.character(metadata[[factor]])
return(out)
}) %>%
mutate(path_conc = factor(path_conc,
levels = sort(unique(path_conc)),
labels = paste(sort(unique(path_conc)), 'CFU/g'),
ordered = TRUE)) %>%
filter(sample_type == 'Sample') %>%
select(sample_name, factor, value, invsimpson) %>%
tidyr::gather(key = "index", value = "diversity", -sample_name, -factor, -value) %>%
mutate(value = forcats::fct_relevel(ordered(value), "Control", "Cin", "Plu", "Cry"))
# ANOVA and Tukey's HSD
anova_and_hsd <- function(x) {
anova_result <- aov(diversity ~ value, x)
tukey_result <- agricolae::HSD.test(anova_result, "value", group = TRUE)
group_data <- tukey_result$groups[order(rownames(tukey_result$groups)),]
group_key <- setNames(group_data$groups, rownames(group_data))
group_key[as.character(x$value)]
}
alpha_plot_data$group <- unlist(map(split(alpha_plot_data, alpha_plot_data$factor)[unique(alpha_plot_data$factor)], anova_and_hsd))
alpha_subplot <- ggplot(alpha_plot_data, aes(x = value, y = diversity)) +
geom_boxplot() +
geom_text(aes(x = value,
y = max(diversity) + 2,
label = group),
col = 'black',
size = 5) +
facet_grid( ~ factor, scales = "free") +
labs(x = NULL, y = 'Diversity (Inverse Simpson)') +
guides(color = "none") +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position="bottom")
set.seed(1)
prob_table <- obj$data$prop[metadata$sample_name]
nmds_plot_data <- function(prob_table) {
metadata <- metadata[metadata$sample_name %in% colnames(prob_table), ]
set.seed(1)
nmds_results <- vegan::metaMDS(t(prob_table), trymax = 1000, k = 2, trace = 0)
nmds_data <- nmds_results$points %>%
as_tibble() %>%
bind_cols(metadata)
names(nmds_data)[1:2] <- paste0("NMDS", 1:2)
return(nmds_data)
}
nmds_data <- nmds_plot_data(prob_table[! is.na(metadata$organism) & metadata$valid_inoc])
nmds_factors <- c(Flooded = 'flooded', Organism = 'organism', 'Pathogen CFU/g' = 'path_conc', 'Trial' = 'experiment')
make_one_plot <- function(factor, name) {
nmds_data %>%
mutate(factor = as.character(nmds_data[[factor]]),
NMDS1 = scales::rescale(NMDS1),
NMDS2 = scales::rescale(NMDS2)) %>%
mutate(factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu", "Cry")) %>%
ggplot(aes_string(x = "NMDS1", y = "NMDS2", color = "factor", label = "sample_name")) +
# geom_label(size = 2) +
geom_point() +
coord_fixed() +
viridis::scale_color_viridis(discrete = TRUE, end = .9) +
labs(color = NULL, x = NULL, y = NULL) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 7),
axis.ticks = element_blank(),
plot.margin = unit(rep(0.04, 4), "cm"),
# panel.background = element_rect(fill = 'transparent', colour = NA),
# plot.background = element_rect(fill = "white", colour = NA),
legend.position = "bottom",
legend.text = element_text(size = 10),
legend.key.height = unit(0.5, 'cm'),
legend.key.width = unit(0.2, 'cm'))
}
nmds_subplots <- map2(nmds_factors, names(nmds_factors), make_one_plot)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
nmds_plot <- ggpubr::ggarrange(plotlist = c(list(ggplot() + theme_void()), nmds_subplots),
nrow = 1, widths = c(0.15, 1, 1, 1, 1))
ggsave(nmds_plot, path = "demulticoder/figures", filename = "nmds.pdf",
width = 7, height = 8)
print(nmds_plot)
combined_div_plot <- ggpubr::ggarrange(alpha_subplot, nmds_plot, ncol = 1, labels = c('A', 'B'),
heights = c(1, 1))
combined_div_plot
ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.pdf",
width = 10, height = 6, bg = "#FFFFFF")
ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.svg",
width = 10, height = 6, bg = "#FFFFFF")
dist_matrix <- vegdist(t(prob_table), method = "bray")
adonis2_full <- adonis2(dist_matrix ~ organism + flooded + path_conc + experiment,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ organism + flooded + path_conc + experiment, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## organism 3 1.254 0.02384 1.3900 0.058 .
## flooded 1 1.848 0.03514 6.1469 0.001 ***
## path_conc 1 0.276 0.00524 0.9165 0.517
## experiment 1 2.614 0.04971 8.6953 0.001 ***
## Residual 155 46.598 0.88608
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_org <- adonis2(dist_matrix ~ organism,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_org)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ organism, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## organism 3 1.254 0.02384 1.2861 0.095 .
## Residual 158 51.336 0.97616
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_flooded <- adonis2(dist_matrix ~ flooded,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_flooded)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ flooded, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## flooded 1 1.849 0.03515 5.8294 0.001 ***
## Residual 160 50.741 0.96485
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_path_conc <- adonis2(dist_matrix ~ path_conc,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_path_conc)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ path_conc, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## path_conc 1 0.308 0.00586 0.9431 0.46
## Residual 160 52.281 0.99414
## Total 161 52.590 1.00000
adonis2_experiment <- adonis2(dist_matrix ~ experiment,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_experiment)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ experiment, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## experiment 1 2.613 0.04969 8.3655 0.001 ***
## Residual 160 49.977 0.95031
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Make sure our factors are input properly as factors
metadata$path_conc <- as.factor(metadata$path_conc)
metadata$path_conc<- factor(metadata$path_conc, levels = c("0", "1", "100"))
metadata$organism <- as.factor(metadata$organism)
metadata$organism<- factor(metadata$organism, levels = c("Control", "Cin", "Cry", "Plu"))
metadata$flooded <- as.factor(metadata$flooded)
metadata$flooded <- factor(metadata$flooded, levels = c("TRUE", "FALSE"))
metadata$experiment <- as.factor(metadata$experiment)
metadata$experiment<- factor(metadata$experiment, levels = c("1", "2"))
abundance$is_low_abund <- rowSums(abundance[, metadata$sample_name]) < 5
obj2 <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj2$data) <- c('abund', 'score')
obj2 <- transmute_obs(obj2, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
print(obj2)
## <Taxmap>
## 1573 taxa: aab. Eukaryota, aac. Fungi ... cin. Fungi_sp
## 1573 edges: NA->aab, aab->aac, aab->aad ... cil->cim, cim->cin
## 2 data sets:
## abund:
## # A tibble: 2,707 × 180
## taxon_id sequence dada2_tax is_inoculum inoculum S1 S10
## <chr> <chr> <chr> <lgl> <chr> <int> <int>
## 1 bga AAGTCGTAA… Eukaryot… FALSE <NA> 4670 3565
## 2 bgb AAGTCGTAA… Eukaryot… FALSE <NA> 493 668
## 3 bgc AAAAAGTCG… Eukaryot… FALSE <NA> 7603 0
## # ℹ 2,704 more rows
## # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
## # S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
## # S108 <int>, S109 <int>, …
## score:
## # A tibble: 23,852 × 4
## taxon_id sequence boot rank
## <chr> <chr> <chr> <chr>
## 1 aab AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 Doma…
## 2 aac AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 King…
## 3 aae AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 73 Phyl…
## # ℹ 23,849 more rows
## 0 functions:
obj2$data$tax_abund <- calc_taxon_abund(obj2, data = "abund", cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1573 taxa
obj2$data$abund_prop <- calc_obs_props(obj2, data = "abund", cols = metadata$sample_name, groups = rep("tax_prop", nrow(metadata)))
## Calculating proportions from counts for 162 columns in 1 groups for 2707 observations.
obj2$data$tax_prop <- calc_taxon_abund(obj2, data = "abund_prop", cols = "tax_prop")
## Summing per-taxon counts from 1 columns for 1573 taxa
obj2$data$abund_prop <- NULL
obj_subset <- obj2 %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
min_bootstrap <- 0
obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
## [1] 2.11993e-11 1.00000e+00
## [1] -9.832517 29.140832
## [1] 3.934637e-21 9.957609e-01
## [1] -24.80020 5.15338
## [1] 7.212696e-51 1.000000e+00
## [1] -42.16813 43.59079
## [1] 1.807349e-190 9.952921e-01
## [1] -26.94927 25.83220